goal

Here, we want to recreate Figure 1A of Lax et al 2017.

In addition to looking at the floor samples like they did, we will look at each sample type separately. In Figure 1A, they use room floor and station floor samples.

libs/input/output

libraries

library(phyloseq); packageVersion("phyloseq")
## [1] '1.44.0'
# library(Biostrings); packageVersion("Biostrings")
library(ggplot2); packageVersion("ggplot2")
## [1] '3.4.2'
library(dplyr); packageVersion("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## [1] '1.1.2'
library(ggrepel); packageVersion("ggrepel")
## [1] '0.9.3'

counts data

DADA2_part2 = dir("../..", pattern="DADA2_part2", full.names = T, include.dirs = T)
countsFile = file.path(DADA2_part2, "output", "asv-counts.txt")
counts = read.delim2(countsFile, row.names = 1)
dim(counts)
## [1] 3008  952
counts = counts[rowSums(counts) > 0,]
dim(counts)
## [1] 2915  952
head(row.names(counts))
## [1] "ERR1459793" "ERR1459883" "ERR1460576" "ERR1460577" "ERR1460578"
## [6] "ERR1460579"
head(colnames(counts))
## [1] "ASV.1" "ASV.2" "ASV.3" "ASV.4" "ASV.5" "ASV.6"

metadata

metaReduced = dir("../..", pattern="metadata_PRJEB14474", full.names = T, include.dirs = T)
metaFile = file.path(metaReduced, "output", "PRJEB14474_SraRunTable_reduced.txt")
meta = read.delim2(metaFile)
row.names(meta) = meta$ID
message("Read metadata from: ", metaFile)
## Read metadata from: ../../02_review_metadata_PRJEB14474/output/PRJEB14474_SraRunTable_reduced.txt

We will need to describe days as being “pre-opening” or “post-opening”. This information is held is “study_day”. Make a new variable that is a simple category instead of a numeric.

meta$opening = "before opening"
meta$opening[meta$study_day > 0] = "after opening"

We should have meta data for all samples we had data for…and for some samples that have been filtered out. Limit the meta data to only include samples we have in the data.

inData = row.names(counts)
meta = meta[inData, ]
dim(meta)
## [1] 2915   14

taxonomy

assignTax = dir("../..", pattern="assign_ASV_taxonomy", full.names = T, include.dirs = T)
taxaFile = file.path(assignTax, "output/silva/asvTaxaTable_silva-v138.txt")
taxa = read.delim2(taxaFile, row.names = "asvID")
# drop the ASV column
taxaLim = taxa[,grep("ASV", names(taxa), invert = T, value = T)]
taxaMat = as.matrix(taxaLim)

direct output

# tmpDir = "../temp"
# suppressWarnings(dir.create(tmpDir, recursive = T))
# 
outDir = "../output"
suppressWarnings(dir.create(outDir, recursive = T))

Ordination function

In the legend, Figure 1A is described as:

  1. Principal coordinate analysis (PCoA) of all floor samples based on weighted UniFrac distance and colored by whether they were taken before or after the hospital’s opening.
# alt syntax: 
# doPCoA = function(counts=counts, meta=meta, taxaMat=taxaMat, sampleType="Floor"){

doPCoA = function(sampleType="Floor"){
  sampleSet = meta[meta$SAMPLE_TYPE %in% sampleType, "ID"]
  countslim = counts[sampleSet,]
  
  ps <- phyloseq(otu_table(counts[sampleSet,], taxa_are_rows=FALSE), 
                 sample_data(meta[sampleSet,]), 
                 tax_table(taxaMat))
  ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
  ord <- ordinate(ps.prop, method="PCoA", distance="bray") # TODO: switch to unifrac to match paper
  po = plot_ordination(ps.prop, ord, shape="surcat", color="opening", title=paste0("bray PCoA - ", paste0(sampleType, collapse=", ")))
  return(list(ord=ord, plotA=po))
}
floor = doPCoA(sampleType="Floor")
floor$plotA

ggsave(file.path(outDir, "floor_bray_PCoA.png"))
## Saving 7 x 5 in image

Each type

Do the same for each sample type.

sampleTypes = unique(meta$SAMPLE_TYPE)
plotList = as.list(sampleTypes)
names(plotList) = sampleTypes

for (type in sampleTypes){
  aplot = doPCoA(sampleType=type)
  plotList[[type]] = aplot
  show(aplot$plotA)
}

Save images to a pdf.

pdf(file.path(outDir, "allTypes_bray_PCoA.pdf"))

for (type in sampleTypes){
  show(plotList[[type]]$plotA)
}

dev.off()
## quartz_off_screen 
##                 2

specific groups

a = doPCoA(sampleType=c("Floor", "Corridor Floor"))
show(a$plotA)

ggsave(file.path(outDir, "floors_bray_PCoA.png"), a$plotA)
## Saving 7 x 5 in image
b = doPCoA(sampleType=c("Countertop", "Computer Mouse", "Station Phone"))
show(b$plotA)

ggsave(file.path(outDir, "fig1-B.png"), b$plotA)
## Saving 7 x 5 in image

post trends

Now I am curious. Corridor Floor samples separate nicely along axis 1 between pre- and post-opening. What do I see if I plot the axis 1 value against the study day? Do I see leap from one stable state to another? Do I see a gradual change over time?

do.plot2 = function(type, axis="Axis.1"){
  y = plotList[[type]]$ord$vectors[,axis]
  df = data.frame(pcoa = y,
                  study_day = meta[names(y),"study_day"],
                  opening = meta[names(y),"opening"])
  dfPost = df[df$study_day > 0, ]
  ct = cor.test(x = dfPost$study_day, y = dfPost$pcoa)
  plot2 = ggplot(df, aes(x=study_day, y=pcoa, col=opening)) +
    geom_point() +
    geom_vline(xintercept = 0,
               linetype = 1, 
               col="grey") +
    ylab(axis) +
    ggtitle(type)
  if (ct$p.value < 0.05) {
    annot = paste("post-opening cor: ", round(ct$estimate, 2), "; pval:", round(ct$p.value, 4))
    plot2 = plot2 + 
      ggtitle(type, subtitle = annot)
  }
  show(plot2)
  return(plot2)
}
L2 = list(
  do.plot2("Corridor Floor"),
  do.plot2("Floor", "Axis.1"),
  do.plot2("Floor", "Axis.2"),
  do.plot2("Cold Tap Water"),
  do.plot2("Countertop"),
  do.plot2("Computer Mouse"),
  do.plot2("Station Phone"),
  do.plot2("blank control")
)

pdf(file.path(outDir, "study-day.v.pcoAx.pdf"))

for (p2 in L2){
  show(p2)
}

dev.off()
## quartz_off_screen 
##                 2

each vs blank and/or glove

Do each sapmle type alongside the blank controls.

doPCoA2 = function(sampleType="Floor"){
  sampleSet = meta[meta$SAMPLE_TYPE %in% sampleType, "ID"]
  countslim = counts[sampleSet,]
  
  ps <- phyloseq(otu_table(counts[sampleSet,], taxa_are_rows=FALSE), 
                 sample_data(meta[sampleSet,]), 
                 tax_table(taxaMat))
  ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
  ord <- ordinate(ps.prop, method="PCoA", distance="bray") # TODO: switch to unifrac to match paper
  po = plot_ordination(ps.prop, ord, shape="surcat", color="SAMPLE_TYPE", title=paste0("bray PCoA - ", paste0(sampleType, collapse=", ")))
  return(list(ord=ord, plotB=po))
}

eachVsBlank = list()
for (type in sampleTypes){
  eachVsBlank[[type]] = doPCoA2(sampleType=c("blank control", type))
  show(eachVsBlank[[type]]$plotB)
}

pdf(file.path(outDir, "eachType_vBlank_PCoA.pdf"))

for (type in sampleTypes){
  show(eachVsBlank[[type]]$plotA)
}
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
dev.off()
## quartz_off_screen 
##                 2
eachVsGlove = list()
for (type in sampleTypes){
  eachVsGlove[[type]] = doPCoA2(sampleType=c("Glove from Glove Box", type))
  show(eachVsGlove[[type]]$plotB)
}

pdf(file.path(outDir, "eachType_vGlove_PCoA.pdf"))

for (type in sampleTypes){
  show(eachVsGlove[[type]]$plotA)
}
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
dev.off()
## quartz_off_screen 
##                 2
eachVsBlankGlove = list()
for (type in sampleTypes){
  eachVsBlankGlove[[type]] = doPCoA2(sampleType=c("blank control", "Glove from Glove Box", type))
  show(eachVsBlankGlove[[type]]$plotB)
}

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggrepel_0.9.3   dplyr_1.1.2     ggplot2_3.4.2   phyloseq_1.44.0
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.3            xfun_0.39               bslib_0.4.2            
##  [4] rhdf5_2.44.0            Biobase_2.60.0          lattice_0.21-8         
##  [7] rhdf5filters_1.12.1     vctrs_0.6.2             tools_4.3.0            
## [10] bitops_1.0-7            generics_0.1.3          biomformat_1.28.0      
## [13] stats4_4.3.0            parallel_4.3.0          tibble_3.2.1           
## [16] fansi_1.0.4             highr_0.10              cluster_2.1.4          
## [19] pkgconfig_2.0.3         Matrix_1.5-4            data.table_1.14.8      
## [22] S4Vectors_0.38.1        lifecycle_1.0.3         GenomeInfoDbData_1.2.10
## [25] farver_2.1.1            compiler_4.3.0          stringr_1.5.0          
## [28] textshaping_0.3.6       Biostrings_2.68.1       munsell_0.5.0          
## [31] codetools_0.2-19        permute_0.9-7           GenomeInfoDb_1.36.0    
## [34] htmltools_0.5.5         sass_0.4.6              RCurl_1.98-1.12        
## [37] yaml_2.3.7              pillar_1.9.0            crayon_1.5.2           
## [40] jquerylib_0.1.4         MASS_7.3-58.4           cachem_1.0.8           
## [43] vegan_2.6-4             iterators_1.0.14        foreach_1.5.2          
## [46] nlme_3.1-162            tidyselect_1.2.0        digest_0.6.31          
## [49] stringi_1.7.12          reshape2_1.4.4          labeling_0.4.2         
## [52] splines_4.3.0           ade4_1.7-22             fastmap_1.1.1          
## [55] grid_4.3.0              colorspace_2.1-0        cli_3.6.1              
## [58] magrittr_2.0.3          survival_3.5-5          utf8_1.2.3             
## [61] ape_5.7-1               withr_2.5.0             scales_1.2.1           
## [64] rmarkdown_2.21          XVector_0.40.0          multtest_2.56.0        
## [67] igraph_1.4.3            ragg_1.2.5              evaluate_0.21          
## [70] knitr_1.42              IRanges_2.34.0          mgcv_1.8-42            
## [73] rlang_1.1.1             Rcpp_1.0.10             glue_1.6.2             
## [76] BiocGenerics_0.46.0     rstudioapi_0.14         jsonlite_1.8.4         
## [79] R6_2.5.1                Rhdf5lib_1.22.0         plyr_1.8.8             
## [82] systemfonts_1.0.4       zlibbioc_1.46.0